loads libraries

library(here)
library(rjags)
library(ggplot2)
library(tidyverse)
library(gNanoPkg)

Initialise data

gNano.df = readData()

Specify where the results are stored relative to the gNano project and this file.

jamesDuncan = TRUE

resultsRoot = if(jamesDuncan){
    "../systematic/"
  }else{
    "../gNanoPkg/systematic/"
  }

Model \(g_0\) - no effects

Model $g_0$

Model \(g_0\)

Model: \[ y_{c,l,a} \sim \Gamma(r,s) \] If \[ \mathrm{E}[y_{c,l,a}]) = \log(\mu)\mbox{ and }\mathrm{Var}[y_{c,l,a}] = \sigma^2 \] then we let \[ r = \frac{\mu^2}{\sigma^2}\mbox{ and }s = \frac{\mu}{\sigma^2} \] Our prior on \(\sigma^2\) is \(inverse-\Gamma(0.001, 0.001)\), and our prior on \(\log(\mu)\) is \(N(0, 10^6)\).

First I will load up the results.

g0.df = loadResults("g-0", resultsRoot = resultsRoot)

I’ll do this with ggplot2 so I can teach myself something

p = g0.df %>% 
  ggplot(aes(x = tau)) +
  geom_density()
p

We want plots of shape and rate.

p = g0.df %>% 
  ggplot(aes(x = shape))+
  geom_density()
p

p = g0.df %>% 
  ggplot(aes(x = rate))+
  geom_density()
p

rateDensity = density(g0.df$rate)
shapeDensity = density(g0.df$shape)

rateMode = rateDensity$x[which.max(rateDensity$y)]
shapeMode = shapeDensity$x[which.max(shapeDensity$y)]

density.df = data.frame(x = seq(1, 30000, 1)) %>% 
  mutate(y = dgamma(x, rate = rateMode, shape = shapeMode))

p = gNano.df %>% 
  ggplot(aes(x = obs, y = stat(density))) + 
  geom_histogram(binwidth = 400) +
  geom_line(data = density.df, aes(x, y))
p

Plot the expected vs the observed peak heights

obsExpectPlot(g0.df, "g")

Model \(g_1\) - profile effects

Model $g_1$

Model \(g_1\)

Let’s have a look at the profile effects. I like to use error bar plots

g1.df = loadResults("g-1", resultsRoot = resultsRoot)

This code is not elegant, and is probably stupid but then so is the tidyverse a lot of the time :-)

effectsPlot(g1.df, "profile")

aph = gNano.df %>% 
  group_by(prof) %>% 
  summarise(aph = mean(log(obs), na.rm = TRUE)) %>% 
  pull(aph)


meanEffect = g1.df %>% 
  select(starts_with("beta.profile")) %>% 
  summarise_all(mean, na.rm = TRUE) %>% 
  unlist()
  
plot.df = data.frame(aph = log10(exp(aph)), meanEffect = meanEffect)
p = plot.df %>% 
  ggplot(aes(x = aph, y = meanEffect)) + 
  geom_point(show.legend = FALSE) +
  geom_smooth() + 
  xlab(expression(log[10]~(aph))) + 
  ylab(bquote(mean~tau[c])) + 
  theme(text = element_text(size = 20), aspect.ratio = 1, plot.background = element_rect(color="black"))

p

Plot the expected vs the observed peak heights

obsExpectPlot(g1.df, "g")

and a pred-res plot

obsExpectPlot(g1.df, "g", predRes = TRUE)

Model \(g_2\) - locus effects

Model $g_2$

Model \(g_2\)

g2.df = loadResults("g-2", resultsRoot = resultsRoot)

effectsPlot(g2.df, "locus")

Plot the expected vs observed peak heights

obsExpectPlot(g2.df, "g")
## Warning: Removed 348 rows containing non-finite values (stat_smooth).

and the pred-res plot

obsExpectPlot(g2.df, "g", TRUE)

Plot the locus effects by Average Peak Height:

aph = gNano.df %>% 
  group_by(loc) %>% 
  summarise(aph = mean(log(obs), na.rm = TRUE)) %>% 
  pull(aph)


meanEffect = g2.df %>% 
  select(starts_with("alpha.locus")) %>% 
  summarise_all(mean, na.rm = TRUE) %>% 
  unlist()
  
plot.df = data.frame(aph = log10(exp(aph)), meanEffect = meanEffect)
p = plot.df %>% 
  ggplot(aes(x = aph, y = meanEffect)) + 
  geom_point(show.legend = FALSE) +
  geom_smooth() + 
  xlab(expression(log[10]~(aph))) + 
  ylab(bquote(mean~alpha[l])) + 
  theme(text = element_text(size = 20), aspect.ratio = 1, plot.background = element_rect(color="black"))
p

Model \(g_3\) - all other effects except extra variance

Model $g_2$

Model \(g_2\)

Dye effects

g3.df = loadResults("g-3", resultsRoot = resultsRoot)

effectsPlot(g3.df, "dye")

Plot the dye effects by Average Peak Height - no fitted line here because with four points who cares:

aph = gNano.df %>% 
  group_by(dye) %>% 
  summarise(aph = mean(log(obs), na.rm = TRUE)) %>% 
  pull(aph)


meanEffect = g3.df %>% 
  select(starts_with("gamma.dye")) %>% 
  summarise_all(mean, na.rm = TRUE) %>% 
  unlist()
  
plot.df = data.frame(aph = log10(exp(aph)), meanEffect = meanEffect)
p = plot.df %>% 
  ggplot(aes(x = aph, y = meanEffect)) + 
  geom_point(show.legend = FALSE) +
  geom_smooth() + 
  xlab(expression(log[10]~(aph))) + 
  ylab(bquote(mean~delta[f])) + 
  theme(text = element_text(size = 20), aspect.ratio = 1, plot.background = element_rect(color="black"))
p
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.296
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.095418
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.036101
## Warning in sqrt(sum.squares/one.delta): NaNs produced
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small.
## fewer data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 3.296
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 0.095418
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 0.036101
## Warning in stats::qt(level/2 + 0.5, pred$df): NaNs produced

Plot the expected vs observed peak heights

obsExpectPlot(g3.df, "g")
## Warning: Removed 19 rows containing non-finite values (stat_smooth).
## Warning: Removed 569 rows containing non-finite values (stat_smooth).

and the pred-res plot

g3.predRes = obsExpectPlot(g3.df, "g", TRUE)

Model \(g_4\) - extra variance

Is the added variance being used?

g4.df = loadResults("g-4", resultsRoot = resultsRoot)

g4.df = g4.df %>% 
  mutate(sigma.sq0 = 1 / tau0)

p = g4.df %>% 
  ggplot(aes(x = sigma.sq0/10^6)) +
  geom_density() +
  geom_vline(xintercept = quantile(g4.df$sigma.sq0/10^6, probs=0.025)) +
  geom_vline(xintercept = quantile(g4.df$sigma.sq0/10^6, probs=0.975)) +
  xlab(bquote(sigma[0]^2~(x10^6))) + 
  theme(text = element_text(size = 20), aspect.ratio = 1, plot.background = element_rect(color="black"))
p

Plot the expected vs observed peak heights

obsExpectPlot(g4.df, "g")
## Warning: Removed 19 rows containing non-finite values (stat_smooth).
## Warning: Removed 589 rows containing non-finite values (stat_smooth).

And the pred-res plot

g4.predRes = obsExpectPlot(g4.df, "g", TRUE)

Is it doing anything?

plot.df = data.frame(
  expected.g3 = g3.predRes$plot.df$mean,
  expected.g4 = g4.predRes$plot.df$mean
)
p = plot.df %>% 
  ggplot(aes(x = expected.g3, y = expected.g4)) +
           geom_point() +
           geom_abline(aes(intercept = 0, slope = 1)) + 
          xlim(1, 5) +
          ylim(1, 5)
p

I think it’s doing what it is supposed to.

Let’s try and look at the expected values slightly differently. I will work with the summary stats rather than the raw data

simSummary = loadResults("g-3", resultsRoot = resultsRoot, summary = TRUE)

g3.quantiles = as.data.frame(t(simSummary$quantiles)) %>% 
  select(starts_with("pred"))  %>%
   rownames_to_column %>% 
   gather(var, value, -rowname) %>% 
   spread(rowname, value) %>% 
  mutate(var = as.numeric(gsub("^pred\\[([0-9]+)\\]$", "\\1", var))) %>% 
  arrange(var)

g3.stats = as.data.frame(t(simSummary$statistics)) %>% 
  select(starts_with("pred"))  %>%
   rownames_to_column %>% 
   gather(var, value, -rowname) %>% 
   spread(rowname, value) %>% 
  mutate(var = as.numeric(gsub("^pred\\[([0-9]+)\\]$", "\\1", var))) %>% 
  arrange(var)



simSummary = loadResults("g-4", resultsRoot = resultsRoot, summary = TRUE)

g4.quantiles = as.data.frame(t(simSummary$quantiles)) %>% 
    select(starts_with("pred"))  %>%
   rownames_to_column %>% 
   gather(var, value, -rowname) %>% 
   spread(rowname, value)  %>% 
  mutate(var = as.numeric(gsub("^pred\\[([0-9]+)\\]$", "\\1", var))) %>% 
  arrange(var)

plot.df = bind_rows(g3.quantiles, g4.quantiles) %>% 
  mutate(model = rep(c("g3", "g4"), rep(nrow(g3.quantiles), 2))) %>% 
  mutate(obs = rep(gNano.df$obs, 2)) %>% 
  rename(med = `50%`, lb = `2.5%`, ub = `97.5%`) %>% 
  select(obs, model, lb, med, ub) %>% 
  arrange(obs)


p = plot.df %>% 
  ggplot(aes(x = log10(obs), y = log10(med))) +
  facet_grid(vars(model)) +
    geom_abline(aes(intercept = 0, slope = 1)) +
    xlim(1, 5) +
    ylim(1, 5) +
  geom_point(show.legend = FALSE) + 
  geom_smooth() +
  geom_line(aes(y = log10(lb)), col = "green") + geom_line(aes(y = log10(ub)), col = "red")
p
## Warning: Removed 38 rows containing non-finite values (stat_smooth).
## Warning: Removed 38 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_path).

Now we move on the the log normal models

Model \(ln_0\) - no effects

Tau

ln0.df = loadResults("ln-0", resultsRoot = resultsRoot)

p = ln0.df %>% 
  ggplot(aes(x = tau)) +
  geom_density()
p

Mu

p = ln0.df %>% 
  ggplot(aes(x = Mu)) +
  geom_density()
p

Comparing expected and observed peak heights

obsExpectPlot(ln0.df, "ln")

and the pred-res plot

obsExpectPlot(ln0.df, "ln", TRUE)

Model \(ln_1\) - profile effects

Load the data

ln1.df = loadResults("ln-1", resultsRoot = resultsRoot)

Showing the per profile effects

effectsPlot(ln1.df, "profile")

Now showing the relationship between profile aph and profile effects from model

aph = gNano.df %>% 
  group_by(prof) %>% 
  summarise(aph = mean(log(obs), na.rm = TRUE)) %>% 
  pull(aph)


meanEffect = ln1.df %>% 
  select(starts_with("beta.profile")) %>% 
  summarise_all(mean, na.rm = TRUE) %>% 
  unlist()
  
plot.df = data.frame(aph = log10(exp(aph)), meanEffect = meanEffect)
p = plot.df %>% 
  ggplot(aes(x = aph, y = meanEffect)) + 
  geom_point(show.legend = FALSE) +
  geom_smooth() +
  xlab(expression(log[10]~(aph))) + 
  ylab(bquote(mean~tau[c])) + 
  theme(text = element_text(size = 20), aspect.ratio = 1, plot.background = element_rect(color="black"))
p

Plotting observed vs expected peak heights

obsExpectPlot(ln1.df, "ln")

and the pred-res plot

obsExpectPlot(ln1.df, "ln", TRUE)

## Model \(ln_2\) - locus effects

ln2.df = loadResults("ln-2", resultsRoot = resultsRoot)

effectsPlot(ln2.df, "locus")

Plot the expected vs observed peak heights

obsExpectPlot(ln2.df, "ln")

and the pred-res

obsExpectPlot(ln2.df, "ln", TRUE)

Plot the locus effects by Average Peak Height:

aph = gNano.df %>% 
  group_by(loc) %>% 
  summarise(aph = mean(log(obs), na.rm = TRUE)) %>% 
  pull(aph)


meanEffect = ln2.df %>% 
  select(starts_with("alpha.locus")) %>% 
  summarise_all(mean, na.rm = TRUE) %>% 
  unlist()
  
plot.df = data.frame(aph = log10(exp(aph)), meanEffect = meanEffect)
p = plot.df %>% 
  ggplot(aes(x = aph, y = meanEffect)) + 
  geom_point(show.legend = FALSE) +
  geom_smooth() +
  xlab(expression(log[10]~(aph))) + 
  ylab(bquote(mean~alpha[l])) + 
  theme(text = element_text(size = 20), aspect.ratio = 1, plot.background = element_rect(color="black"))
p

Model \(ln_3\) - all other effects except extra variance

Dye effects

ln3.df = loadResults("ln-3", resultsRoot = resultsRoot)
effectsPlot(ln3.df, "dye")

Plot the dye effects by Average Peak Height - no fitted line here because with four points who cares:

aph = gNano.df %>% 
  group_by(dye) %>% 
  summarise(aph = mean(log(obs), na.rm = TRUE)) %>% 
  pull(aph)


meanEffect = ln3.df %>% 
  select(starts_with("gamma.dye")) %>% 
  summarise_all(mean, na.rm = TRUE) %>% 
  unlist()
  
plot.df = data.frame(aph = log10(exp(aph)), meanEffect = meanEffect)
p = plot.df %>% 
  ggplot(aes(x = aph, y = meanEffect)) + 
  geom_point(show.legend = FALSE) +
  geom_smooth() +
  xlab(expression(log[10]~(aph))) + 
  ylab(bquote(mean~delta[f])) + 
  theme(text = element_text(size = 20), aspect.ratio = 1, plot.background = element_rect(color="black"))
p
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 3.296
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.095418
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.036101
## Warning in sqrt(sum.squares/one.delta): NaNs produced
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small.
## fewer data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at 3.296
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 0.095418
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 0.036101
## Warning in stats::qt(level/2 + 0.5, pred$df): NaNs produced

Plot the expected vs observed peak heights

obsExpectPlot(ln3.df, "ln")

and the pred-res

obsExpectPlot(ln3.df, "ln", TRUE)

Model \(ln_4\) - extra variance

Is the added variance being used?

ln4.df = loadResults("ln-4", resultsRoot = resultsRoot)

ln4.df = ln4.df %>% 
  mutate(sigma.sq0 = 1 / tau0)

p = ln4.df %>% 
  ggplot(aes(x = sigma.sq0)) +
  geom_density() +
  geom_vline(xintercept = quantile(ln4.df$sigma.sq0, probs=0.025)) +
  geom_vline(xintercept = quantile(ln4.df$sigma.sq0, probs=0.975)) +
  xlab(bquote(sigma[0]^2)) + 
  theme(text = element_text(size = 20), aspect.ratio = 1, plot.background = element_rect(color="black"))
p

Plot the expected vs observed peak heights

obsExpectPlot(ln4.df, "ln")

And the pred-res

obsExpectPlot(ln4.df, "ln", TRUE)

Compare the dlog(likelihoods) for the log-normal model 4 vs the gamma model 4

ln4.ll = calcLogLik(ln4.df, "ln")
g4.ll = calcLogLik(g4.df, "g")
compareLL(ln4.ll, g4.ll, labels = c("LN-4", "G-4"))

Comparing the g3 model and the g4 model

g3.ll = calcLogLik(g3.df, "g")
compareLL(g3.ll, g4.ll, labels = c("G-3", "G-4"))

Compare the ln3 and ln4 models

ln3.ll = calcLogLik(ln3.df, "ln")
## Warning: NAs introduced by coercion
compareLL(ln3.ll, ln4.ll, labels = c("LN-3", "LN-4"))

Compare all the ln models at once:

ln0.ll = calcLogLik(ln0.df, "ln")
ln1.ll = calcLogLik(ln1.df, "ln")
## Warning: NAs introduced by coercion
ln2.ll = calcLogLik(ln2.df, "ln")
## Warning: NAs introduced by coercion
compareLL(ln0.ll, ln1.ll, ln2.ll, ln3.ll, ln4.ll,
          labels = paste0("LN-", 0:4))

LN-0 kind of distorts this so, let’s drop it.

compareLL(ln1.ll, ln2.ll, ln3.ll, ln4.ll,
          labels = paste0("LN-", 1:4))

So no real value in the variance model. Definite gains in adding in dye and locus effects (and profile, but we don’t care about those).

Does the removal of sum constraints matter?

ln5.df = loadResults("ln-5", resultsRoot = resultsRoot)
effectsPlot(ln5.df, "profile")

So this is interesting in that all the profile effects are non-zero. Let’s just look at ln4

effectsPlot(ln4.df, "profile")

effectsPlot(ln5.df, "locus")

Hmm - similar

effectsPlot(ln5.df, "dye")

And this too - all positive

obsExpectPlot(ln5.df, "ln")

No real change here in terms of predicted values

obsExpectPlot(ln5.df, "ln", TRUE)

What about change to the log-likelihood?

ln5.ll = calcLogLik(ln5.df, "ln")
compareLL(ln4.ll, ln5.ll, labels = c("LN-4", "LN-5"))

I think we’d say realistically no. So are the sum constraints being swallowed in the mean?

p = ln5.df %>% 
  ggplot(aes(x = Mu)) +
  geom_density()
p

And I’d say the answer is “yes.” I think we should probably just stick with the constraints. I’m happier with them being there than not, and they make more sense.